# sys, file and nav packages:
import datetime as dt

# math packages:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.distributions.empirical_distribution import ECDF

# charting:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import ticker
from matplotlib import colors
import seaborn as sns
import matplotlib.gridspec as gridspec

# home brew utitilties
import resources.utility_functions as ut
import resources.abundance_classes as ac
import resources.chart_kwargs as ck

import resources.sr_ut as sut

# images and display
import base64, io, IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display, Math, Latex
import matplotlib.image as mpimg


# set some parameters:
today = dt.datetime.now().date().strftime("%Y-%m-%d")
start_date = '2020-03-01'
end_date ='2021-05-31'

sns.set_style('whitegrid')

a_fail_rate = 50

unit_label = 'p/100m'
reporting_unit = 100

# name of the output folder:
name_of_project = 'trans_report_all'

a_color = 'dodgerblue'

# set the maps
bassin_map = PILImage.open("resources/maps/survey_locations_all.jpeg")
bassin_pallette = {'rhone':'dimgray', 'aare':'salmon', 'linth':'tan', 'ticino':'steelblue', 'reuss':'purple'}

# define the feature level and components
comps = ['linth', 'rhone', 'aare', 'ticino']
comp_labels = {"linth":"Linth/Limmat", "rhone":"Rhône", 'aare':"Aare", "ticino":"Ticino/Cerisio", "reuss":"Reuss"}
comp_palette = {"Linth/Limmat":"dimgray", "Rhône":"tan", "Aare":"salmon", "Ticino/Cerisio":"steelblue", "Reuss":"purple"}

# explanatory variables:
luse_exp = ['% to buildings', '% to recreation', '% to agg', '% to woods', 'streets km', 'intersects']

# columns needed
use_these_cols = ['loc_date' ,
                  '% to buildings',
                  '% to trans', 
                  '% to recreation',
                  '% to agg',
                  '% to woods',
                  'population',
                  'river_bassin',
                  'water_name_slug',
                  'streets km',
                  'intersects',
                  'length',
                  'groupname',
                  'code'
                 ]

# these are default
top_name = ["All survey areas"]

# add the folder to the directory tree:
project_directory = ut.make_project_folder('output', name_of_project)

# get your data:
survey_data = pd.read_csv('resources/results_with_land_use_2015.csv')
river_bassins = ut.json_file_get("resources/river_basins.json")
dfBeaches = pd.read_csv("resources/beaches_with_land_use_rates.csv")
dfCodes = pd.read_csv("resources/codes_with_group_names_2015.csv")
dfDims = pd.read_csv("resources/dims_data.csv")

# set the index of the beach data to location slug
dfBeaches.set_index('slug', inplace=True)

city_map = dfBeaches['city']

# map locations to feature names
location_wname_key = dfBeaches.water_name_slug

# map water_name_slug to water_name
wname_wname = dfBeaches[['water_name_slug','water_name']].reset_index(drop=True).drop_duplicates()
wname_wname.set_index('water_name_slug', inplace=True)
        
def make_plot_with_spearmans(data, ax, n):
    """Gets Spearmans ranked correlation and make A/B scatter plot. Must proived a
    matplotlib axis object.
    """    
    sns.scatterplot(data=data, x=n, y=unit_label, ax=ax, color='black', s=30, edgecolor='white', alpha=0.6)
    corr, a_p = stats.spearmanr(data[n], data[unit_label])
    
    return ax, corr, a_p

dfCodes.set_index("code", inplace=True)

# these descriptions need to be shortened for display
dfCodes = sut.shorten_the_value(["G74", "description", "Insulation: includes spray foams"], dfCodes)
dfCodes = sut.shorten_the_value(["G940", "description", "Foamed EVA for crafts and sports"], dfCodes)
dfCodes = sut.shorten_the_value(["G96", "description", "Sanitary-pads/tampons, applicators"], dfCodes)
dfCodes = sut.shorten_the_value(["G178", "description", "Metal bottle caps and lids"], dfCodes)
dfCodes = sut.shorten_the_value(["G82", "description", "Expanded foams 2.5cm - 50cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G81", "description", "Expanded foams .5cm - 2.5cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G117", "description", "Expanded foams < 5mm"], dfCodes)
dfCodes = sut.shorten_the_value(["G75", "description", "Plastic/foamed polystyrene 0 - 2.5cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G76", "description", "Plastic/foamed polystyrene 2.5cm - 50cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G24", "description", "Plastic lid rings"], dfCodes)
dfCodes = sut.shorten_the_value(["G33", "description", "Lids for togo drinks plastic"], dfCodes)
dfCodes = sut.shorten_the_value(["G3", "description", "Plastic bags, carier bags"], dfCodes)
dfCodes = sut.shorten_the_value(["G204", "description", "Bricks, pipes not plastic"], dfCodes)

# make a map to the code descriptions
code_description_map = dfCodes.description

# make a map to the code descriptions
code_material_map = dfCodes.material

20. Shared responsibility

Research on litter transport and accumulation in the aquatic environment has come to the conclusion that rivers are primary sources of land based macro-plastics to the marine environment [GFCH+21]. However not all objects that are transported by rivers make it to ocean, suggesting that rivers and inland lakes are also sinks for a portion of the macro-plastics that are emitted. [KBK+18]

In Switzerland this places the front line of reducing litter in the water directly on the municipal and cantonal administrations [fdlCs20]. Litter is serious business in Switzerland. The first national litter survey was conducted from 2017 - 2018 [Bla18], the Swiss Litter Report used the methods defined by the Guide for Monitoring Marine Litter (The Guide) [Han13]. Over 1000 surveys were completed in twelve months covering much of the national territory.

There are provisions in the law to account for unlawful disposal, commonly known as the principle of polluter payer. According to art. 2 of the Federal Act on the Protection of the Environment (LPE) the principal of causality is defined as :

The person who initiates a measure prescribed by this Act shall bear the costs thereof. [fdlCs20]

In Switzerland the elimination of waste and the management of litter is covered under chapter four of the LPE and the Ordinance on the limitation and disposal of waste (OLED) [fs20]. Article 31b of the LPE places the responsibility for the removal of litter on the cantons:

Municipal waste, waste from roads and public sewage treatment plants, as well as waste whose holder cannot be identified (litter) or is insolvent, are disposed of by the cantons.

In reference to the LPE and OLED the federal court ruling of 2012 for the city of Bern concluded that:

In the case of urban waste in the public space, the perpetrator often cannot be identified and that, consequently, those responsible further up the chain of causation may be required to contribute to the financing. [fdlenvironnement18]

Cantons and municipalities maintain the responsibility of the elimination and management of litter because legally they are owners of the land within their boundaries. However, this ruling allows for a more realistic application of the principle of causality.

f the person responsible for the release of municipal waste into the public space cannot be identified (littering or public garbage cans), it is permissible to consider companies or persons further up the chain of causality as producers of waste and to charge disposal fees to them (e.g. fast-food companies and similar businesses, or organizers of events that generate large quantities of waste on the public space) insofar as objectively founded criteria allow it. [fc12]

The Federal courts ruling and the current law provide a common sense legal basis to attribute the responsibility for litter if the specific offender cannot be identified. The responsibility may be attributed to another person or enterprise insofar as objectively founded criteria allow it

20.1. The Challenge

The challenge is to meet the requirements of objective criteria. The method to meet the requirements needs to be robust, transparent and easily repeatable.

output = io.BytesIO()

this_image = PILImage.open("resources/images/gclosmay2020.jpeg")
this_image.thumbnail((800, 1200))
this_image.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()

html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)

Obtaining objective criteria. Grand Clos, St. Gingoplh May 2020

The challenge is to extract as much information from the litter objects based on the quantity found, the properties of the object and the environmental variables in proximity of the survey location.

Obtaining objective criteria from beach litter data is further complicated because of the \(\approx\) 61’000km of rivers and 1500 lakes in Switzerland [con21].

The hydrologic conditions of rivers have an effect on the distance and direction that litter, once introduced into a river will travel. Large low density objects will most likely be transported to the next reservoir or area of reduced flow. High density objects will only be transported if the flow velocity and turbulence of the water are sufficient to keep the objects off the bottom.Once high density items enter a low velocity zone they tend to settle or sink. [SLBH19]

The results from the latest national litter survey in Switzerland suggest that objects like cigarette ends and food wrappers are at least 21% of total objects recorded in 2020 and can be attributed to human behavior within 1500m of where it was found. [ham21]

However, other objects have no definitive geographic source or no clear association with an activity in proximity to where they are found. The most common of these objects are \(\approx\) 40% of all litter items identified in 2020 -2021 [ham21]. Thus there is a clear incentive to identify litter objects that are of local origin and those that may have arrived at the survey location by some-other means.

20.2. The test

The test compares the survey results of two groups of objects under two different land use profiles. The objects tested are based on the results of the Spearmans ranked correlation test on the survey results of the most common objects with respect to land use conditions (annex 1). The results of this test were used to create two groups of objects.

  1. Objects that have multiple positive associations to land use features and one association is to buildings

  2. Objects that have few or no positive associations to land use features and no associations to buildings

The locations were grouped according to the percent of land attributed to buildings, woods and agriculture.

20.2.1. The objects

The most common objects are those objects that were either among the top ten most abundant objects or objects that were found in at least 50% of the surveys. The most common objects were first separated into two groups according to the previously cited criteria:

  1. contributed (CG): objects that have multiple positive associations to land use features and one association is to buildings

    • Cigarette ends

    • Metal bottle tops

    • Snack wrappers

    • Glass bottles and pieces

  2. distributed (DG): objects that have few or no positive associations to land use features

    • Fragmented expanded polystyrene

    • Plastic production pellets

    • Fragmented plastics

    • Cotton swabs

    • Industrial sheeting

    • Construction plastics

Note: cotton swabs are included with DG because they are usually introduced directly into a body of water after passing through a water treatment facility.

  1. other: objects that do not meet the above criteria

Distribtuion group (DG) objects

# read images
img_a = mpimg.imread('resources/codegroups/images/fragplass_dense_450_600.jpg')
img_b = mpimg.imread('resources/codegroups/images/fragfoam_450_600.jpg')
img_c = mpimg.imread('resources/codegroups/images/infrastructure_450_600.jpg')
img_d = mpimg.imread('resources/codegroups/images/gpis_450_600.jpg')

# display images
fig, ax = plt.subplots(2,2, figsize=(12,8))

axone=ax[0,0]
ut.hide_spines_ticks_grids(axone)
axone.imshow(img_a);
axone.set_title("Fragmented plastics", **ck.title_k14)

axtwo=ax[0,1]
ut.hide_spines_ticks_grids(axtwo)
axtwo.imshow(img_b);
axtwo.set_title("Fragmented foams", **ck.title_k14)
axthree=ax[1,0]
ut.hide_spines_ticks_grids(axthree)
axthree.imshow(img_c);
axthree.set_title("Construction plastics", **ck.title_k14)
axfour=ax[1,1]
ut.hide_spines_ticks_grids(axfour)
axfour.set_title("Plastic production pellets", **ck.title_k14)
axfour.imshow(img_d);



plt.tight_layout()
plt.show()
../_images/assessing_transport_7_0.png

Two objects were selected from the CG and DG groups and tested under the same conditions

  1. From the CG group: Food and tobacco (FT)

    • Cigarette ends

    • Snack Wrappers

  2. From the DG group: Fragmented foams and plastics (FP)

    • Fragmented expanded polystyrene

    • Fragmented plastics

20.2.2. The locations

The survey locations were grouped into two classes:

  1. urban: locations that have a percent of land attributed to buildings GREATER than the median

  2. rural: locations that have a percent of land attributed to buildings LESS than the median AND percent of land attributed to woods or agriculture greater than the median

The rural class had 142 surveys for 47 locations versus 158 surveys from 37 locations in the urban class.

20.2.3. The hypothesis

If there is no significant evidence that a land use feature contributes to the accumulation of an object then the distribution of that object should be \(\approx\) under all land use conditions.

Null hypothesis: there is no statistically significant difference between survey results of DG objects at rural and urban locations.

alternate hypothesis: there is a statistically significant difference between survey results of DG objects at rural and urban locations.

20.3. The data

output = io.BytesIO()
bassin_map.thumbnail((800, 1200))
bassin_map.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()

html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)

The survey locations 2020-2021: In total 385 litter-surveys were conducted touching 77 municipalities with a combined population of 1,735,000.

# collect all the data and transform
a_data = survey_data.copy()

# save adated for testing
a_dated = a_data[(a_data.date >= start_date)&(a_data.date <= end_date)].copy()

# format columns and slice by date
a_data['loc_date']=tuple(zip(a_data.location, a_data.date))
a_data['date']=pd.to_datetime(a_data['date'], format='%Y-%m-%d')
a_data = a_data[(a_data.date >= start_date)&(a_data.date <= end_date)]
a_data['material'] = a_data.code.map(lambda x: code_material_map.loc[x])
a_data['city'] = a_data.location.map(lambda x: city_map.loc[x])

# combine survey areas
# combine lugano and maggiore
a_data['river_bassin'] = a_data.river_bassin.where(a_data.river_bassin != 'tresa', 'ticino' )

# combine reuss and linth
a_data['river_bassin'] = a_data.river_bassin.where(a_data.river_bassin != 'reuss', 'linth' )

# convert meters of streets to kilometers
a_data['streets'] = a_data.streets.astype('int')
a_data['streets km'] = a_data.streets/1000

# make a reporting unit column, keep the pcs_m column
a_data[unit_label] = (a_data.pcs_m *reporting_unit).astype('int')

# Combine the different sizes of fragmented plastics and styrofoam
# the codes for the foams, these will be called Gfoam
some_foams = ['G81', 'G82', 'G83']

# the codes for the fragmented plastics, these will be called Gfrags
some_frag_plas = list(a_data[a_data.groupname == 'plastic pieces'].code.unique())

# this extracts and aggregates the members of the two code groups, returns a dataframe
the_plast_rows = sut.create_aggregate_groups(a_data, codes_to_agg=some_frag_plas,a_model_code="G79", a_new_code="Gfrags")
the_foam_rows = sut.create_aggregate_groups(a_data, codes_to_agg=some_foams, a_model_code="G82", a_new_code="Gfoam")

# the foam codes and fragmented plastic codes have been aggregated in to Gfrags and Gfoam
a_data = sut.replace_a_group_of_codes_with_one(a_data, new_code_values=[the_plast_rows, the_foam_rows], codes_to_replace=[*some_frag_plas, *some_foams])

# code and material totals
material_totals = a_data.groupby('material').quantity.sum()
code_totals = a_data.groupby(['code'], as_index=False).agg({unit_label:'mean', 'quantity':'sum'})

# cumulative statistics for each code
code_totals["% of total"] = ((code_totals.quantity/code_totals.quantity.sum())*100).round(2)
code_totals["fail"] = code_totals.code.map(lambda x: a_data[(a_data.code == x) & (a_data.quantity > 0)].loc_date.nunique())
code_totals["fail rate"] = ((code_totals.fail/a_data.loc_date.nunique())*100).astype('int')
code_totals.set_index('code', inplace=True)
code_totals['material'] = code_totals.index.map(lambda x: code_material_map[x])
code_totals['item'] = code_totals.index.map(lambda x: code_description_map[x])

# survey totals
dt_all = a_data.groupby(['loc_date','location','river_bassin', 'water_name_slug','city','date'], as_index=False).agg({unit_label:'sum', 'quantity':'sum'})

#!! feature data!!#
fd = a_data.copy()

# column headers for the survey area data
fd['survey area'] = fd.river_bassin.map(lambda x: ut.use_this_key(x, comp_labels))

# gather the municpalities and the population:
fd_pop_map = dfBeaches.loc[fd.location.unique()][['city','population']].copy()
fd_pop_map.drop_duplicates(inplace=True)
fd_pop_map.set_index('city', drop=True, inplace=True)

# survey_totals feature data
fd_dt=dt_all.copy()
fd_dt['month'] = fd_dt.date.dt.month

# map survey total quantity to loc_date
fd_dq = fd_dt[['loc_date', 'quantity']].set_index('loc_date')

# material totals:
fd_mat_t = code_totals[['material', 'quantity']].groupby('material', as_index=False).quantity.sum()
fd_mat_t["% of total"] = fd_mat_t.quantity/fd_mat_t.quantity.sum()

# material totals
fd_mat_totals = fd_mat_t.sort_values(by='quantity', ascending=False)

mat_cols_to_use = {'material':'Material','quantity':'Quantity', '% of total':'% of total'}
fd_mat_totals['% of total'] =(fd_mat_totals['% of total']*100).round(1)
fd_mat_totals['quantity']=fd_mat_totals.quantity.map(lambda x: '{:,}'.format(x))
fd_mat_totals['% of total']=fd_mat_totals['% of total'].map(lambda x:F"{x}%")
fd_mat_t = fd_mat_totals[mat_cols_to_use.keys()].values

# summary statistics:
fd_n_samps = len(fd_dt)
fd_n_obj = fd.quantity.sum()
fd_n_locs = fd.location.nunique()
fd_n_munis = len(fd_pop_map.index)
fd_effected_population = fd_pop_map.sum()

fd_locs = fd.location.unique()
fd_samps = fd.loc_date.unique()

# gather the dimensional data for the time frame from dfDims
fd_dims= dfDims[(dfDims.location.isin(fd_locs))&(dfDims.date >= start_date)&(dfDims.date <= end_date)].copy()

# map the survey area name to the dims data record
m_ap_to_survey_area = fd[['location', 'river_bassin']].drop_duplicates().to_dict(orient='records')
a_new_map = {x['location']:x['river_bassin'] for x in m_ap_to_survey_area}

# make a survey area column in the dims data
fd_dims['survey area'] = fd_dims.location.map(lambda x: ut.use_this_key(x, a_new_map))

# map length and area from dims to survey data
st_map = fd_dims[['loc_date', 'length', 'area']].to_dict(orient='records')
amap = {x['loc_date']:{'length':x['length'], 'area':x['area']}for x in st_map}
trans = {x:F"{x}"for x in fd.loc_date.unique()}

def this_map(x,amap,trans, var='length'):
    try:
        data = amap[trans[x]][var]
    except:
        data = 0
    return data  
    
fd['length'] = fd.loc_date.map(lambda x:  this_map(x,amap,trans, var='length'))
fd['area'] = fd.loc_date.map(lambda x:  this_map(x,amap,trans, var='area'))
fd['water'] = fd.location.map(lambda x: dfBeaches['water'][x])

# these surveys are missing area and length data. 
# use the average values from all the surveys at that location to fill in the missing valuesf
make_lengths = fd.loc[fd.location.isin(['baby-plage-geneva','quai-maria-belgia'])].groupby('location').agg({'length':'mean', 'area':'mean'})
fd.loc[fd.loc_date == ('baby-plage-geneva', '2021-04-14'), 'length'] = 84
fd.loc[fd.loc_date == ('baby-plage-geneva', '2021-04-14'), 'area'] = 355
fd.loc[fd.loc_date.isin([('quai-maria-belgia', '2021-02-28'), ('quai-maria-belgia', '2021-01-31')]), 'length'] = 34
fd.loc[fd.loc_date.isin([('quai-maria-belgia', '2021-02-28'), ('quai-maria-belgia', '2021-01-31')]), 'area'] = 145

# gather the land use data for each location
luse_this_study = dfBeaches.loc[fd_locs].copy()
sns.set_style('whitegrid')

# the locations in the area of walensee are missing luse dataf
# we are only comparing locations that have a complete profile
lu_th_st = luse_this_study[luse_this_study.luse_total > 650]

#! the data minus the locations with incomplete land use data
#! including the % ranking of the land use features
grtr_10 = fd[fd.location.isin(lu_th_st.index)].copy()
dt_nw = grtr_10.groupby(use_these_cols[:-2], as_index=False).agg({unit_label:'sum'})

# the cumulative distributions for the different survey areas
ecdfs = {x:{} for x in comps}
ecdfs.update({"All surveys":{}})

for i, n in enumerate(luse_exp):
    for element in comps:
        the_data = ECDF(dt_nw[dt_nw.river_bassin == element][n].values)
        ecdfs[element].update({n:the_data})
        x, y = the_data.x, the_data.y           
    a_all_surveys =  ECDF(dt_nw[n].values)
    ecdfs["All surveys"].update({n:a_all_surveys})

Out of the 385 surveys only those that were on lakes were considered for this analysis.

Survey results urban and rural locations March 2020 - May 2021. Left: Survey totals urban v/s rural, n=300. Right: distribution of survey results urban - rural with detail of code-group results.

# summary stats table
# labels for the .describe function
change_names = {'count':'# samples',
                'mean':F"average {unit_label}",
                'std':'standard deviation',
                '25%':'25%', 
                '50%':'50%',
                '75%':'75%',
                'max':F"max {unit_label}",
                'min':F"min {unit_label}",
                'total objects':'total objects',
                '# locations':'# locations',
                'survey year':'survey year'
               }

def anew_dict(x):
    new_dict = {}
    for param in x.index:
        new_dict.update({change_names[param]:x[param]})
    return new_dict  

#the most abundant
most_abundant = code_totals.sort_values(by="quantity", ascending=False)[:10].index

# found greater than 50% of the time
l_grtr_50 = code_totals[code_totals['fail rate'] >= 50].index

# the most common
most_common = list(set([*most_abundant, *l_grtr_50]))

# eleminate surveys less than 10m and greater than 100m
# restricts surveys to locations on lakes
grtr_10 = grtr_10[(grtr_10.water == 'l')].copy()

# add different date identifiers
grtr_10['month'] = grtr_10.date.dt.month
grtr_10["eom"] = grtr_10.date.map(lambda x: pd.Period(x,freq='M').end_time.date())

def add_pctile_rate(x,ecdfs,feature):
    data = ecdfs['All surveys'][feature](x)
    return np.round((data*100),1)  

# get the percentile ranking of the land use features for each location
grtr_10['b_group'] = grtr_10["% to buildings"].map(lambda x: add_pctile_rate(x,ecdfs,"% to buildings"))
grtr_10['a_group'] = grtr_10["% to agg"].map(lambda x: add_pctile_rate(x,ecdfs,"% to agg"))
grtr_10['w_group'] = grtr_10["% to woods"].map(lambda x: add_pctile_rate(x,ecdfs,"% to woods"))
grtr_10['r_group'] = grtr_10["% to recreation"].map(lambda x: add_pctile_rate(x,ecdfs,"% to recreation"))

# group all the land use % into called profile
# this is a unique identifier for each location
grtr_10['profile'] = list(zip(grtr_10["% to buildings"], grtr_10["% to agg"], grtr_10["% to woods"], grtr_10["% to recreation"]))

# identify rural and urban location
grtr_10['rural'] = ((grtr_10.w_group > 50) | (grtr_10.a_group > 50) ) & (grtr_10.b_group < 50)
grtr_10['rural'] = grtr_10['rural'].where(grtr_10['rural'] == True, 'urban')
grtr_10['rural'] = grtr_10['rural'].where(grtr_10['rural'] == 'urban', 'rural')

# make a list of the objects by their association to buildings, streets recreation and thier use
# food and drink
cont = ["G27", "G30", "G178", "G200"]

# objects with no or few associations and not related to tobacco or food
dist = list(set(most_common) - set(cont))

# lables for the two groups and a label to catch all the other objects
grtr_10['group'] = 'other'
grtr_10['group'] = grtr_10.group.where(~grtr_10.code.isin(dist), 'DG')
grtr_10['group'] = grtr_10.group.where(~grtr_10.code.isin(cont), 'CG')

# survey totals of all locations with its land use profile
initial = ['loc_date','date','rural','group','a_group', 'b_group', 'r_group', 'w_group', 'streets', 'intersects']
grtr_10dt=grtr_10.groupby(initial, as_index=False).agg({unit_label:"sum", "quantity":"sum"})
grtr_10qkey = grtr_10dt[['loc_date', 'quantity']].set_index('loc_date')

# assigning survey totals to the cont and dist survey totals
def wtf(x, grtr_10key):
    return grtr_10key.loc[[x]].values[0][0]

# survey totals of contributed and distributed objects, 
second = ['loc_date', 'group', 'rural', 'date','eom', 'river_bassin','location','a_group', 'b_group', 'r_group', 'w_group', 'streets', 'intersects','profile']
grtr_10dtc=grtr_10.groupby(second, as_index=False).agg({unit_label:"sum", "quantity":"sum"})
# adding the survey total of all objects to each record
grtr_10dtc['dt']= grtr_10dtc.loc_date.map(lambda x:fd_dq.loc[[x], 'quantity'][0])

# calculating the % total of contributed and distributed at each survey
grtr_10dtc['pt']= grtr_10dtc.quantity/grtr_10dtc.dt

# the_dists = grtr_10dtc[grtr_10dtc.group == 'DG']
# the_conts = grtr_10dtc[grtr_10dtc.group == 'CG']

less_than = grtr_10dtc[(grtr_10dtc['rural'] == 'rural')].location.unique()
grt_than = grtr_10dtc[(grtr_10dtc['rural'] == 'urban')].location.unique()
grt_dtr = grtr_10dtc.groupby(['loc_date', 'date','rural'], as_index=False)[unit_label].agg({unit_label:"sum"})
# months locator, can be confusing
# https://matplotlib.org/stable/api/dates_api.html
months = mdates.MonthLocator(interval=1)
months_fmt = mdates.DateFormatter('%b')
days = mdates.DayLocator(interval=7)
fig, axs = plt.subplots(1,2, figsize=(11,6), sharey=True)

group_palette = {'CG':'magenta', 'DG':'teal', 'other':'tan'}
rural_palette = {'rural':'black', 'urban':'salmon' }

ax = axs[0]
sns.scatterplot(data=grt_dtr, x='date', y=unit_label, hue='rural', s=80, palette=rural_palette, alpha=0.6, ax=ax)

ax.set_ylim(0,grt_dtr[unit_label].quantile(.98)+50 )

ax.set_xlabel("")
ax.set_ylabel(unit_label, **ck.xlab_k14)

ax.xaxis.set_minor_locator(days)
ax.xaxis.set_major_formatter(months_fmt)

axtwo = axs[1]

box_props = {
    'boxprops':{'facecolor':'none', 'edgecolor':'black'},
    'medianprops':{'color':'black'},
    'whiskerprops':{'color':'black'},
    'capprops':{'color':'black'}
}

sns.boxplot(data=grt_dtr, x='rural', y=unit_label, dodge=False, showfliers=False, ax=axtwo, **box_props)
sns.stripplot(data=grtr_10dtc,x='rural', y=unit_label, ax=axtwo, zorder=1, hue='group', palette=group_palette, jitter=.35, alpha=0.3, s=8)
axtwo.set_ylabel(unit_label, **ck.xlab_k14)

ax.tick_params(which='both', axis='both', labelsize=14)
axtwo.tick_params(which='both', axis='both', labelsize=14)
axtwo.set_xlabel(" ")

plt.tight_layout()
plt.show()
plt.close()
../_images/assessing_transport_18_0.png

Summary data all survey totals

change_names = {'count':'# samples', 
                'mean':F"average {unit_label}",
                'std':'standard deviation', 
                'min p/50m':'min', '25%':'25%',
                '50%':'50%', '75%':'75%',
                'max':F"max {unit_label}", 'min':F"min {unit_label}",
                'total objects':'total objects',
                '# locations':'# locations',
                'survey year':'survey year'
               }

# convenience function to change the index names in a series
def anew_dict(x):
    new_dict = {}
    for param in x.index:
        new_dict.update({change_names[param]:x[param]})
    return new_dict  

# select data
data = grt_dtr

# get the basic statistics from pd.describe
desc_2020 = data.groupby('rural')[unit_label].describe()
desc_2020.loc['all surveys', ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']] = grt_dtr.groupby(['loc_date', 'date'])[unit_label].sum().describe().to_numpy()
desc = desc_2020.astype('int')
desc = desc.applymap(lambda x: F"{x:,}")
desc.rename(columns={'count':'samples'}, inplace= True)
desc.reset_index(inplace=True)

# make tables
fig, axs = plt.subplots(figsize=(7,3.4))

# summary table
# names for the table columns
a_col = [top_name[0], 'total']

axone = axs
ut.hide_spines_ticks_grids(axone)

a_table = axone.table(cellText=desc.values,  colLabels=desc.columns, colWidths=[.19,*[.1]*8], loc='lower center', bbox=[0,0,1,1])
the_material_table_data = sut.make_a_summary_table(a_table,desc.values,desc.columns, s_et_bottom_row=False)


plt.tight_layout()
plt.subplots_adjust(wspace=0.2)
plt.show()
../_images/assessing_transport_20_0.png

20.4. Differences between urban and rural survey totals

Survey results in rural locations had a lower median and mean than urban locations and all locations combined. The maximum and the minimum values as well as the highest standard deviation were recorded at rural locations.

20.4.1. Confidence intervals (CIs) of the median survey results

The upper range of the median survey results from rural location is less than the lower range of the median survey result from urban locations.

# this code was modified from this source:
# http://bebi103.caltech.edu.s3-website-us-east-1.amazonaws.com/2019a/content/recitations/bootstrapping.html
# if you want to get the confidence interval around another point estimate use np.percentile
# and add the percentile value as a parameter

def draw_bs_sample(data):
    """Draw a bootstrap sample from a 1D data set."""
    return np.random.choice(data, size=len(data))

def compute_jackknife_reps(data, statfunction=None, stat_param=False):
    '''Returns jackknife resampled replicates for the given data and statistical function'''
    # Set up empty array to store jackknife replicates
    jack_reps = np.empty(len(data))

    # For each observation in the dataset, compute the statistical function on the sample
    # with that observation removed
    for i in range(len(data)):
        jack_sample = np.delete(data, i)
        if not stat_param:
            jack_reps[i] = statfunction(jack_sample)
        else:
            jack_reps[i] = statfunction(jack_sample, stat_param)          
        
    return jack_reps


def compute_a(jack_reps):
    '''Returns the acceleration constant a'''

    mean = np.mean(jack_reps)
    try:
        a = sum([(x**-(i+1)- (mean**-(i+1)))**3 for i,x in enumerate(jack_reps)])
        b = sum([(x**-(i+1)-mean-(i+1))**2 for i,x in enumerate(jack_reps)])
        c = 6*(b**(3/2))
        data = a/c
    except:
        print(mean)
    return data


def bootstrap_replicates(data, n_reps=1000, statfunction=None, stat_param=False):
    '''Computes n_reps number of bootstrap replicates for given data and statistical function'''
    boot_reps = np.empty(n_reps)
    for i in range(n_reps):
        if not stat_param:
            boot_reps[i] = statfunction(draw_bs_sample(data))
        else:
            boot_reps[i] = statfunction(draw_bs_sample(data), stat_param)     
        
    return boot_reps


def compute_z0(data, boot_reps, statfunction=None, stat_param=False):
    '''Computes z0 for given data and statistical function'''
    if not stat_param:
        s = statfunction(data)
    else:
        s = statfunction(data, stat_param)
    return stats.norm.ppf(np.sum(boot_reps < s) / len(boot_reps))


def compute_bca_ci(data, alpha_level, n_reps=1000, statfunction=None, stat_param=False):
    '''Returns BCa confidence interval for given data at given alpha level'''
    # Compute bootstrap and jackknife replicates
    boot_reps = bootstrap_replicates(data, n_reps, statfunction=statfunction, stat_param=stat_param)
    jack_reps = compute_jackknife_reps(data, statfunction=statfunction, stat_param=stat_param)

    # Compute a and z0
    a = compute_a(jack_reps)
    z0 = compute_z0(data, boot_reps, statfunction=statfunction, stat_param=stat_param)

    # Compute confidence interval indices
    alphas = np.array([alpha_level/2., 1-alpha_level/2.])
    zs = z0 + stats.norm.ppf(alphas).reshape(alphas.shape+(1,)*z0.ndim)
    avals = stats.norm.cdf(z0 + zs/(1-a*zs))
    ints = np.round((len(boot_reps)-1)*avals)
    ints = np.nan_to_num(ints).astype('int')

    # Compute confidence interval
    boot_reps = np.sort(boot_reps)
    ci_low = boot_reps[ints[0]]
    ci_high = boot_reps[ints[1]]
    return (ci_low, ci_high)


the_bcas = {}

an_int = 50

# rural cis
r_median = grt_dtr[grt_dtr.rural == 'rural'][unit_label].median()
a_result = compute_bca_ci(grt_dtr[grt_dtr.rural == 'rural'][unit_label].to_numpy(), .05, n_reps=5000, statfunction=np.percentile, stat_param=an_int)
r_cis = {'rural':{"lower 2.5%":a_result[0], "median":r_median, "upper 97.5%": a_result[1] }}
the_bcas.update(r_cis)
# urban cis
u_median = grt_dtr[grt_dtr.rural == 'urban'][unit_label].median()
a_result = compute_bca_ci(grt_dtr[grt_dtr.rural == 'urban'][unit_label].to_numpy(), .05, n_reps=5000, statfunction=np.percentile, stat_param=an_int)
u_cis = {'urban':{"lower 2.5%":a_result[0], "median":u_median, "upper 97.5%": a_result[1] }}
the_bcas.update(u_cis)
# all surveys
u_median = grt_dtr[unit_label].median()
a_result = compute_bca_ci(grt_dtr[unit_label].to_numpy(), .05, n_reps=5000, statfunction=np.percentile, stat_param=an_int)
all_cis = {'all surveys':{"lower 2.5%":a_result[0], "median":u_median, "upper 97.5%": a_result[1] }}

# combine the results:
the_bcas.update(all_cis)

# make a df
the_cis = pd.DataFrame(the_bcas)
the_cis.reset_index(inplace=True)

fig, axs = plt.subplots()

data = the_cis.values
collabels = the_cis.columns
ut.hide_spines_ticks_grids(axs)

the_first_table_data = axs.table(data, colLabels=collabels, colWidths=[*[.2]*5], bbox=[0, 0, 1, 1])

a_summary_table_one = sut.make_a_summary_table(the_first_table_data,data,collabels, a_color, s_et_bottom_row=True)

a_summary_table_one.get_celld()[(0,0)].get_text().set_text(" ")

plt.show()
plt.close()
../_images/assessing_transport_22_0.png

The 95% confidence interval of survey totals under urban and rural land use conditions

20.5. Survey results of CG and DG with respect to land use

Recall that the most common objects were grouped according to the value of the Spearmans ranked correlation test of p/100m with respect to the land use profile, (annex 1). Creating two groups, one group of objects that has a positive association with percent of land attributed to buildings and one that does not. Cotton swabs are included with the objects that have few or no associations with land use, because they are normally expelled by water treatment plants directly into the nearest body of water.

20.5.1. Assessment of composition

The ratio of DG to CG in the rural group was 1.7, in the urban group it was 0.81. On a per survey basis, DG were a greater percent of the total in all surveys from rural locations. In urban locations the ratio of DG to CG is the inverse of the rural locations and for approximately 20% of the surveys in urban locations the ratio of DG to CG is very close to 1.

Sample results from rural locations had a greater portion of trash attributed to fragmented plastics, construction plastics and foams.

# combine the two object groups into one data frame
DG = "DG"
CG = "CG"

dists = grtr_10dtc[(grtr_10dtc.group == DG)][['loc_date', 'location', unit_label]].set_index('loc_date')
conts = grtr_10dtc[(grtr_10dtc.group == CG)][['loc_date', 'location',  unit_label]].set_index('loc_date')
conts.rename(columns={unit_label:CG}, inplace=True)
dists.rename(columns={unit_label:DG}, inplace=True)
c_v_d = pd.concat([dists, conts], axis=0)
c_v_d['dt'] = c_v_d[DG]/c_v_d[CG]

# the ratio of dist to cont under the two land use conditions
ratio_of_d_c_agg = c_v_d[c_v_d.location.isin(less_than)][DG].sum()/c_v_d[c_v_d.location.isin(less_than)][CG].sum()
ratio_of_d_c_urb= c_v_d[c_v_d.location.isin(grt_than)][DG].sum()/c_v_d[c_v_d.location.isin(grt_than)][CG].sum()

# chart that
fig, ax = plt.subplots(figsize=(6,5))

# get the ecdf of percent of total for each object group under each condition
# p of t urban
co_agecdf = ECDF(grtr_10dtc[(grtr_10dtc.location.isin(grt_than))&(grtr_10dtc.group.isin([CG]))]["pt"])
di_agecdf = ECDF(grtr_10dtc[(grtr_10dtc.location.isin(grt_than))&(grtr_10dtc.group.isin([DG]))]["pt"])

# p of t rural
cont_ecdf = ECDF(grtr_10dtc[(grtr_10dtc.location.isin(less_than))&(grtr_10dtc.group.isin([CG]))]["pt"])
dist_ecdf = ECDF(grtr_10dtc[(grtr_10dtc.location.isin(less_than))&(grtr_10dtc.group.isin([DG]))]["pt"])

sns.lineplot(x=cont_ecdf.x, y=cont_ecdf.y, color='salmon', label="rural: CG", ax=ax)
sns.lineplot(x=co_agecdf.x, y=co_agecdf.y, color='magenta', ax=ax, label="urban: CG")
sns.lineplot(x=dist_ecdf.x, y=dist_ecdf.y, color='teal', label="rural: DG", ax=ax)
sns.lineplot(x=di_agecdf.x, y=di_agecdf.y, color='black', label="urban: DG", ax=ax)

ax.set_xlabel("% of survey total", **ck.xlab_k14)
ax.set_ylabel("% of surveys", **ck.xlab_k14)
plt.legend(loc='lower right', title="% of total")

plt.show()
../_images/assessing_transport_26_0.png

Difference in composition of rural and urban litter surveys

Under different land use conditions 9/10 of the most common objects are the same in rural and urban settings. There are two exceptions:

  • plastic construction waste made the top ten in rural settings

  • industrial pellets made the top ten in urban settings

The most common objects under different land use profiles. Left: rural, right: urban

rur_10 = grtr_10[grtr_10.location.isin(less_than)&(grtr_10.code.isin(most_common))].groupby('code', as_index=False).quantity.sum().sort_values(by='quantity', ascending=False)
urb_10 = grtr_10[grtr_10.location.isin(grt_than)&(grtr_10.code.isin(most_common))].groupby('code', as_index=False).quantity.sum().sort_values(by='quantity', ascending=False)

rur_tot = grtr_10[grtr_10.location.isin(less_than)].quantity.sum()
urb_tot = grtr_10[grtr_10.location.isin(grt_than)].quantity.sum()

rur_10['item'] = rur_10.code.map(lambda x: code_description_map.loc[x])
urb_10['item'] = urb_10.code.map(lambda x: code_description_map.loc[x])

rur_10["% of total"] = ((rur_10.quantity/rur_tot)*100).round(1)
urb_10["% of total"] = ((urb_10.quantity/urb_tot)*100).round(1)

# make tables
fig, axs = plt.subplots(1, 2, figsize=(10,len(most_common)*.8))

# summary table
# names for the table columns
a_col = [top_name[0], 'total']

axone = axs[0]
axtwo = axs[1]

ut.hide_spines_ticks_grids(axone)
ut.hide_spines_ticks_grids(axtwo)

data_one = rur_10[['item', 'quantity', "% of total"]].copy()
data_two = urb_10[['item', 'quantity', "% of total"]].copy()

for a_df in [data_one, data_two]:
    a_df['quantity'] = a_df.quantity.map(lambda x: F"{x:,}")

a_table = axone.table(cellText=data_one.values,  colLabels=data_one.columns, colWidths=[.6,*[.2]*2], loc='lower center', bbox=[0,0,1,1])
the_material_table_data = sut.make_a_summary_table(a_table,data_one.values,data_one.columns, s_et_bottom_row=True)

a_table = axtwo.table(cellText=data_two.values,  colLabels=data_one.columns, colWidths=[.6,*[.2]*2], loc='lower center', bbox=[0,0,1,1])
the_material_table_data = sut.make_a_summary_table(a_table,data_one.values,data_one.columns, s_et_bottom_row=True)

axone.set_xlabel("rural", **ck.xlab_k14)
axtwo.set_xlabel("urban", **ck.xlab_k14)
plt.tight_layout()
plt.subplots_adjust(wspace=0.2)
plt.show()
../_images/assessing_transport_30_0.png

20.5.2. Distribution of survey totals

The survey results of the DG are very similar under both land use classes, there is more variance as the reported value increases but not so much that the distributions diverge. Given the standard deviation of the samples and the high variance of beach-litter-survey data in general this is expected. [HG19]

The two sample Kolmogorov-Smirnov(KS) test(ks=0.073, p=0.808) of the two sets of survey results suggest that the survey results of DG may not be significantly different between the two land use classes. The results from the Mann-Whitney U (MWU) (U=11445.0, p=0.762) *suggest that it is possible that the two distributions are the same.**[sc21a] [sc21b]

Empirical Cumulative Distribution (ECDF) of the survey results of DG and CG objects under the different land use classes

left: DG , right: CG

dist_results_agg = grtr_10dtc[(grtr_10dtc.location.isin(less_than))&(grtr_10dtc.group == DG)].groupby(['loc_date', 'group'])[unit_label].sum().values
dist_results_urb = grtr_10dtc[(grtr_10dtc.location.isin(grt_than))&(grtr_10dtc.group == DG)].groupby(['loc_date', 'group'])[unit_label].sum().values
a_d_ecdf = ECDF(dist_results_agg )
b_d_ecdf = ECDF(dist_results_urb )

cont_results_agg = grtr_10dtc[(grtr_10dtc.location.isin(less_than))&(grtr_10dtc.group == CG)].groupby('loc_date')[unit_label].sum().values
cont_results_urb = grtr_10dtc[(grtr_10dtc.location.isin(grt_than))&(grtr_10dtc.group == CG)].groupby('loc_date')[unit_label].sum().values
a_d_ecdf_cont = ECDF(cont_results_agg)
b_d_ecdf_cont = ECDF(cont_results_urb)

fig, ax = plt.subplots(1,2, figsize=(8,5))

axone = ax[0]
sns.lineplot(x=a_d_ecdf.x, y=a_d_ecdf.y, color='salmon', label="rural", ax=axone)
sns.lineplot(x=b_d_ecdf.x, y=b_d_ecdf.y, color='black', label="urban", ax=axone)
axone.set_xlabel(unit_label, **ck.xlab_k14)
axone.set_ylabel('% of surveys', **ck.xlab_k14)
axone.legend(fontsize=12, title=DG,title_fontsize=14)


axtwo = ax[1]
sns.lineplot(x=a_d_ecdf_cont.x, y=a_d_ecdf_cont.y, color='salmon', label="rural", ax=axtwo)
sns.lineplot(x=b_d_ecdf_cont.x, y=b_d_ecdf_cont.y, color='black', label="urban", ax=axtwo)

axtwo.set_xlabel(unit_label, **ck.xlab_k14)
axtwo.set_ylabel(' ')
axtwo.legend(fontsize=12, title=CG,title_fontsize=14)

plt.show()
../_images/assessing_transport_33_0.png

On the other hand the CG survey results diverge almost immediately according to the land use class. This test results supports the decision to reject the null hypothesis of the Spearmans ranked correlation test for this group of codes and land use profile. The rural locations have less buildings and more agriculture or woods. The two conditions together should reduce the amount of tobacco and food wrappers for that class.

The KS and MWU tests both agree with the visual results that rural and urban CG survey results most likely come from different distributions and they are not equal (ks=0.284, pvalue<.0001), (U=7559.0, pvalue<.0001).

20.5.2.1. Difference of means

The average survey result of DG objects in rural locations was 139.86p/100m in urban locations and 117.75p/100m. A permutation test on the difference of means was conducted on the condition rural - urban.

Difference of means DG objects. \(\mu_{rural}\) - \(\mu_{urban}\), method=shuffle, permutations=5000.

# print(stats.ks_2samp(dist_results_agg, dist_results_urb, alternative='two-sided', mode='auto'))
# print(stats.mannwhitneyu(dist_results_agg,dist_results_urb, alternative='two-sided'))

# #the results for contributed objects
# print(stats.ks_2samp(cont_results_agg, cont_results_urb, alternative='two-sided', mode='auto'))
# print(stats.mannwhitneyu(cont_results_agg, cont_results_urb, alternative='two-sided'))

# pemutation test: of difference of means HD objects
agg_dobj = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.group == DG)].groupby(['loc_date'], as_index=False)[unit_label].sum()
buld_obj = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.group == DG)].groupby(['loc_date'], as_index=False)[unit_label].sum()
# label the urban and rural results
agg_dobj['class'] = 'rural'
buld_obj['class'] = 'urban'

# merge into one 
objs_merged = agg_dobj.append(buld_obj)

# store the mean per class
the_mean = objs_merged.groupby('class')[unit_label].mean()

# store the difference
mean_diff = the_mean.loc['rural'] - the_mean.loc['urban']
new_means=[]
# permutation resampling:
for element in np.arange(5000):
    objs_merged['new_class'] = objs_merged['class'].sample(frac=1).values
    b=objs_merged.groupby('new_class').mean()
    new_means.append((b.loc['rural'] - b.loc['urban']).values[0])
emp_p = np.count_nonzero(new_means <= (the_mean.loc['rural'] - the_mean.loc['urban'])) / 5000

# chart the results
fig, ax = plt.subplots()

sns.histplot(new_means, ax=ax)
ax.set_title(F"$\u0394\mu$ = {np.round(mean_diff, 2)}, perm=5000, p={emp_p} ", **ck.title_k14)
ax.set_ylabel('permutations', **ck.xlab_k14)
ax.set_xlabel("$\mu$ rural - $\mu$ urban", **ck.xlab_k14)
plt.show()
../_images/assessing_transport_36_0.png

Refuse to reject the null hypothesis that these two distributions may be the same

The test of the difference of medians supports the observations that DG survey results occur at similar rates indifferent of the land use profile.

20.5.3. Seasonal variations

Seasonal variations of beach litter survey results has been documented under many conditions and many environments. In 2018 the SLR [Bla18] reported the maximum value in July and the minimum in November. The year 2020-2021 presents the same results.

# the survey results to test
corr_data = grtr_10[(grtr_10.code.isin(most_common))].copy()
results_sprmns = {}
for i,code in enumerate(most_common):
    data = corr_data[corr_data.code == code]
    for j, n in enumerate(luse_exp):
        corr, a_p = stats.spearmanr(data[n], data[unit_label])
        results_sprmns.update({code:{"rho":corr, 'p':a_p}})

# helper dict for converting ints to months
months={
    0:'Jan',
    1:'Feb',
    2:'Mar',
    3:'Apr',
    4:'May',
    5:'Jun',
    6:'Jul',
    7:'Aug',
    8:'Sep',
    9:'Oct',
    10:'Nov',
    11:'Dec'
}

m_dt = grtr_10.groupby(['loc_date', 'date','group'], as_index=False).agg({'quantity':'sum', unit_label:'sum'})

# sample totals all objects
m_dt_t = grtr_10.groupby(['loc_date','date','month', 'eom'], as_index=False).agg({unit_label:'sum'})
m_dt_t.set_index('date', inplace=True)

# data montlhy median survey results contributed, distributed and survey total
fxi=m_dt.set_index('date', drop=True)
data1 = fxi[fxi.group == CG][unit_label].resample("M").mean()
data2 = fxi[fxi.group == DG][unit_label].resample("M").mean()

# data3 = m_dt_t[unit_label].resample("M").mean()

# helper tool for months in integer order
def new_month(x):
    if x <= 11:
        this_month = x
    else:
        this_month=x-12    
    return this_month


# the monthly average discharge rate of the three rivers where the majority of the samples are
aare_schonau = [61.9, 53, 61.5, 105, 161, 155, 295, 244, 150, 106, 93, 55.2, 74.6, 100, 73.6, 92.1]
rhone_scex =   [152, 144, 146, 155, 253, 277, 317, 291, 193, 158, 137, 129, 150, 146, 121, 107]
linth_weesen = [25.3, 50.7, 40.3, 44.3, 64.5, 63.2, 66.2, 61.5, 55.9, 52.5, 35.2, 30.5, 26.1, 42.0, 36.9]

fig, ax = plt.subplots()
    
this_x = [i for i,x in  enumerate(data1.index)]
this_month = [x.month for i,x in enumerate(data1.index)]

twin_ax = ax.twinx()
twin_ax.grid(None)

ax.bar(this_x, data1.to_numpy(), label='contributed', bottom=data2.to_numpy(), linewidth=1, color="salmon", alpha=0.6)
ax.bar([i for i,x in  enumerate(data2.index)], data2.to_numpy(), label='distributed', linewidth=1,color="darkslategray", alpha=0.6)

sns.scatterplot(x=this_x,y=[*aare_schonau[2:], np.mean(aare_schonau)], color='turquoise',  edgecolor='magenta', linewidth=1, s=60, label='Aare m³/s', ax=twin_ax)
sns.scatterplot(x=this_x,y=[*rhone_scex[2:], np.mean(rhone_scex)], color='royalblue',  edgecolor='magenta', linewidth=1, s=60, label='Rhône m³/s', ax=twin_ax)
sns.scatterplot(x=this_x,y=[*linth_weesen[2:], np.mean(linth_weesen), np.mean(linth_weesen)], color='orange', edgecolor='magenta', linewidth=1, s=60, label='Linth m³/s', ax=twin_ax)
handles, labels = ax.get_legend_handles_labels()
handles2, labels2 = twin_ax.get_legend_handles_labels()
ax.xaxis.set_major_locator(ticker.FixedLocator([i for i in np.arange(len(this_x))]))

ax.set_ylabel(unit_label, **ck.xlab_k14)
twin_ax.set_ylabel("m³/sec", **ck.xlab_k14)

axisticks = ax.get_xticks()
labelsx = [months[new_month(x-1)] for x in  this_month]

plt.xticks(ticks=axisticks, labels=labelsx)
plt.legend([*handles, *handles2], [*labels, *labels2], bbox_to_anchor=(0,-.1), loc='upper left', fontsize=14)

plt.show()
../_images/assessing_transport_40_0.png

monthly survey results and river discharge rates m³/second

April and May 2021 are rolling averages, data not available

source : https://www.hydrodaten.admin.ch/en/stations-and-data.html?entrance_medium=menu

20.6. The survey results of FP and FT with respect to land use

Results of KS test and Mann Whitney U

The survey results for FP objects is very similar up to \(\approx\) the 85th percentile where the rural survey results are noticeably larger. Suggesting that extreme values for FP were more likely in rural locations. According to the KS test (ks=0.092, pvalue=0.50) and MWU test (U=11592.0, pvalue=0.62) the distribution of FP objects under the two land use classes is not significantly different and may be equal.

The survey results for FT objects maintain the same features as the parent distribution. The results of the KS test (ks=0.29, pvalue<.001) and MWU test (U=7647.5, p<.001) agree with the results of the parent group, that there is a statistically relevant difference between the survey results under different land use classes.

Left rural - urban: ECDF of survey results fragmented plastics and foams (FP)

Right rural - urban: ECDF of survey results cigarette ends and candy wrappers (FT)

agg_dobj = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.code.isin(['Gfrags', 'Gfoam']))].groupby(['loc_date'])[unit_label].sum().values
buld_obj = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.code.isin(['Gfrags', 'Gfoam']))].groupby(['loc_date'])[unit_label].sum().values

a_d_ecdf = ECDF(agg_dobj)
b_d_ecdf = ECDF(buld_obj)

# print(len(a_d_ecdf.x))
# print(len(b_d_ecdf.x))
# print(a_d_ecdf(17), b_d_ecdf(15))
# print(np.quantile(a_d_ecdf.x,.65))
# print(np.quantile(b_d_ecdf.x, .65))

agg_cont = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'])[unit_label].sum().values
b_cont = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'])[unit_label].sum().values

a_c_ecdf = ECDF(agg_cont)
b_c_ecdf = ECDF(b_cont)

fig, ax = plt.subplots(1,2, figsize=(8,5))

axone = ax[0]
sns.lineplot(x=a_d_ecdf.x, y=a_d_ecdf.y, color='salmon', label="rural", ax=axone)
sns.lineplot(x=b_d_ecdf.x, y=b_d_ecdf.y, color='black', label="urban", ax=axone)
axone.set_xlabel(unit_label, **ck.xlab_k14)
axone.set_ylabel('% of surveys', **ck.xlab_k14)

axone.legend(fontsize=12, title='FP',title_fontsize=14)



axtwo = ax[1]
sns.lineplot(x=a_c_ecdf.x, y=a_c_ecdf.y, color='salmon', label="rural", ax=axtwo)
sns.lineplot(x=b_c_ecdf.x, y=b_c_ecdf.y, color='black', label="urban", ax=axtwo)

axtwo.set_xlabel(unit_label, **ck.xlab_k14)
axtwo.set_ylabel(' ')

# ax.legend(fontsize=12, title='TITLE (fs=30)',title_fontsize=30)

axtwo.legend(fontsize=12, title='FT',title_fontsize=14)
# axtwo.get_legend().set_title("Fragmented plastics")

plt.tight_layout()

plt.show()
../_images/assessing_transport_44_0.png

20.6.1. FP and FT difference of means.

The average survey result of FP objects in rural locations was 22.93p/50m in urban locations it was 12p/50m. A permutation test on the difference of means was conducted on the condition rural - urban.

Difference of means fragmented foams and plastics under the two different land use classes.

\(\mu_{rural}\) - \(\mu_{urban}\), method=shuffle, permutations=5000

# pemutation test: of difference of means FP objects
agg_dobj = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.code.isin(['Gfrags', 'G89']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
buld_obj = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.code.isin(['Gfrags', 'G89']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
# label the urban and rural results
agg_dobj['class'] = 'rural'
buld_obj['class'] = 'urban'

# merge into one 
objs_merged = agg_dobj.append(buld_obj)

# store the mean per class
the_mean = objs_merged.groupby('class')[unit_label].mean()

# store the difference
mean_diff = the_mean.loc['rural'] - the_mean.loc['urban']
new_means=[]

# permutation resampling:
for element in np.arange(5000):
    objs_merged['new_class'] = objs_merged['class'].sample(frac=1).values
    b=objs_merged.groupby('new_class').mean()
    new_means.append((b.loc['rural'] - b.loc['urban']).values[0])
emp_p = np.count_nonzero(new_means <= (the_mean.loc['rural'] - the_mean.loc['urban'])) / 5000

# chart the results
fig, ax = plt.subplots()

sns.histplot(new_means, ax=ax)
ax.set_title(F"$\u0394\mu$ = {np.round(mean_diff, 2)}, perm=5000, p={emp_p} ", **ck.title_k14)
ax.set_ylabel('permutations', **ck.xlab_k14)
ax.set_xlabel("$\mu$ rural - $\mu$ urban", **ck.xlab_k14)
plt.show()
../_images/assessing_transport_48_0.png

Refuse to reject the null hypotheses: there is no statistically significant difference between the two distributions

Difference of means cigarette ends and snack wrappers under the two different land use classes.

\(\mu_{rural}\) - \(\mu_{urban}\), urban method=shuffle, permutations=5000

# pemutation test: of difference of means food objects
agg_cont = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
b_cont = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
# label the urban and rural results
agg_cont['class'] = 'rural'
b_cont['class'] = 'urban'

# merge into one 
objs_merged = agg_cont.append(b_cont)

# store the mean per class
the_mean = objs_merged.groupby('class')[unit_label].mean()

# store the difference
mean_diff = the_mean.loc['rural'] - the_mean.loc['urban']

# permutation resampling:
for element in np.arange(5000):
    objs_merged['new_class'] = objs_merged['class'].sample(frac=1).values
    b=objs_merged.groupby('new_class').mean()
    new_means.append((b.loc['rural'] - b.loc['urban']).values[0])
emp_p = np.count_nonzero(new_means <= (the_mean.loc['rural'] - the_mean.loc['urban'])) / 5000

# chart the results
fig, ax = plt.subplots()

sns.histplot(new_means, ax=ax)
ax.set_title(F"$\u0394\mu$ = {np.round(mean_diff, 2)}, perm=5000, p={emp_p} ", **ck.title_k14)
ax.set_ylabel('permutations', **ck.xlab_k14)
ax.set_xlabel("$\mu$ rural - $\mu$ urban", **ck.xlab_k14)
plt.show()
../_images/assessing_transport_51_0.png

Reject the null hypothesis: the two distributions are most likely not the same

20.7. Discussion

A positive statistically relevant association between CG objects and land use attributed to infrastructure such as streets, recreation areas and buildings can be assumed. Representing 4/12 most common objects, they are \(\approx\) 28% of all objects found and can be associated to activities within 1500m of the survey location. The distribution of these objects changes under the two land use classes.

In contrast the DG group has an \(\approx\) distribution under the different land use classes and no association to percent of land attributed to buildings. Composed of construction plastics, fragmented foams and plastics and industrial pellets the DG represents a diverse group of objects with different densities. With no statistical evidence to the contrary the null hypothesis cannot be rejected.

Changes in the quantities of DG objects are not directly related to the measured land use characteristics and are \(\approx\) under all land use classes. Therefore it can not be assumed that the primary source was within 1500m of the survey location and that a portion of these objects have origins upstream (economically and geographically).

Establishing objective criteria. Identifying and counting the objects after a litter survey

output = io.BytesIO()

this_image = PILImage.open("resources/codegroups/images/takingnotes.jpg")
this_image.thumbnail((800, 1200))
this_image.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()

html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)

The data is then entered into the application by smartphone or pc

20.7.1. Collecting objective criteria

The requirement of objective criteria is a way of ensuring that the principle of causality is applied appropriately. Beach litter survey results provide the basic elements of objective criteria:

  1. The GPS coordinates where the objects were found

  2. The date the objects were found

  3. The quantity of objects found

  4. A description of the object and its uses in the economy

  5. The material type of the object

  6. The dimensions of the area surveyed

There are limits to the information that can be extracted from beach-litter surveys given the current protocol. These limits are related to the costs to complete one survey and the resulting number of samples per sampling period.

Care must be taken to maintain objectivity throughout the process. The best way to estimate the current state is having as many objective samples as possible within the region and sampling period of study.Currently the protocol allows for a proven, repeatable, broad-based assessment with minimal financial burden and almost no fixed costs. Thus allowing all stakeholders a chance to create partnerships and assess performance at different administrative and regional levels.

20.7.2. Creating partnerships

The methods used to establish objective criteria should also be used in all benchmark and baseline calculations. Having benchmarks and goals are essential in establishing meaning full partnerships and determining return on investment

20.7.2.1. Finding partners

The results from the test indicate that CG objects are more prevalent in urban locations. Urban was defined as the land use within 1500m of the survey area. From this it is safe to assume that the cause(s) of CG group litter are also more prevalent in urban areas and that the secondary cause of the litter is within 1500m of the survey location.

Stakeholders looking to reduce the incidence of CG objects within a specific zone may have a better chance of finding motivated partners within 1500m of the location of concern.

The DG group has the particularity that it is distributed in \(\approx\) rates indifferent of the land use and it makes up a larger proportion of the objects found than CG. This implies that the solution is at a larger scale than the municipal boundaries.

Fragmented plastics is the DG only object on the list that cannot be attributed to at least one industry that is present in all the survey areas covered by this analysis.

  • Expanded polystyrene is used as an exterior insulation envelope in the construction industry and is also used in packaging to protect fragile objects during transport.

  • Plastic production pellets are used to make plastic objects in the injection molding process.

  • Cotton swabs are often diverted to rivers and lakes after passing through a water treatment plant.

  • Industrial sheeting is used in horticulture, transport and the construction industry.

  • Construction plastics

Finding partners for these objects may involve an initial phase of informative targeted communication that calls attention to these study results, and current EU thresholds and baselines for beach litter.

20.7.2.2. Individual items

In cases such as the plastic production pellet (GPI), the use case of the item is definite and users and producers are relatively rare with respect to other litter items. Therefore the origin can be determined by searching for consumers of that product within 1500m of the survey location and then progressing upstream. This can be done by locating the survey locations on a map and then overlaying the locations of likely consumers or producers of the object in question.

A causal relationship could be inferred from the image below but not assumed. GPI are small and difficult to clean up once they have been spilled making the exact source impossible to determine. However it is reasonable to assume that the handlers and consumers of GPI will have the best ideas on how to prevent them from escaping into the environment.

output = io.BytesIO()
bassin_map = PILImage.open("resources/images/causality.jpeg")
bassin_map.thumbnail((800, 1200))
bassin_map.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()

html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)

Identifying secondary sources of specific litter items. Consumers or handlers of plastic production pellets and probable fluvial route to survey location. Venoge and Thiele rivers.

Enterprise names withheld

This process can be repeated for any number of distinct items. Currently there are unique codes for various items that can be positively identified and have been found multiple times:

  • Pheromone baits for vineyards

  • Agricultural fencing

  • Electrical tape

  • Cable ties

  • String trimmer line

20.7.2.3. Sharing the responsibility

The principle of Extended Producer Responsibility (EPR) may provide the incentive to producers and consumers to account for the real costs of end-of-life management for the most common litter items identified in Switzerland. [Pou20]

A recent study in the Marine Policy journal identified several limitations of using preexisting beach litter survey data to assess the impact of EPR policy on observed litter quantities. [HLCM21]

  1. Limited data

  2. Heterogeneous methods

  3. Data not collected for the purposes of evaluating ERP

To correct these limitations the authors provide the following recommendations:

  1. Create a data framework specifically for monitoring ERP targets

  2. Identify sources

  3. Use litter counts to establish baselines

  4. Frequent monitoring

Litter counts mitigate the effects of light-weight packaging when collection results are based on weights. [HLCM21]

The IQAASl project addresses three out of the four recommendations and it introduced a method that allows stakeholders to add specific items to the survey protocol. Thus monitoring progress towards ERP targets on a national scale is as simple as defining the characteristics of the objects and creating an entry in the object database.

The issues of heterogeneous methods was solved by ensuring that the data collected in IQAASL was compatible with the data collected by the SLR. Specifically the method to aggregate data from one period to the next was established prior to the IQAASL sampling period. The methods were written in Python and are considered a product of the project.

The current database of beach-litter surveys in Switzerland includes over 1,500 samples collected using the same protocol with two distinct sampling periods of twelve months. Switzerland has all the elements in place to accurately estimate minimum probable values for the most common objects and evaluate stochastic changes on a monthly interval.

There are many improvements to be made concerning the national strategy:

  1. Defining a standardized reporting method for municipal, cantonal and federal stakeholders

  2. Define monitoring or assessment goals

  3. Formalize the data repository and the method for implementation at different administrative levels

  4. Develop a network of associations that share the responsibility and resources for surveying the territory

  5. Develop and implement a formal training program for surveyors

  6. Determine, in collaboration with academic partners, ideal sampling scenarios and research needs

  7. Develop a financing method to ensure that enough samples are collected each year in each survey area so that accurate assessments can be made and research requirements are met.



This project was made possible by the Swiss federal office for the environment.

Love what you do. ❤️

roger@hammerdirt.ch pushed the run button on 2021-09-20.
This document originates from https://github.com/hammerdirt-analyst/IQAASL-End-0f-Sampling-2021 all copyrights apply.

20.8. Annex

  1. surveyors

  2. municipalities, lakes, rivers

The average length and survey area for each location was used to replace the missing values for each record.

20.8.1. IQAASL surveyors

Hammerdirt staff:

  1. Shannon Erismann, field operations manager

  2. Helen Kurukulasuriya, surveyor

  3. Débora Carmo, surveyor

  4. Bettina Siegenthaler, surveyor

  5. Roger Erismann, surveyor

Participating organizations:

  1. Precious plastic leman

  2. Association pour la sauvetage du Léman

  3. Geneva international School

  4. Solid waste management: École polytechnique fédéral Lausanne

20.8.2. Municipalities, lakes and rivers with surveys

lakes = dfBeaches.loc[(dfBeaches.index.isin(fd_locs))&(dfBeaches.water == 'l')]['water_name'].unique()
rivers = dfBeaches.loc[(dfBeaches.index.isin(fd_locs))&(dfBeaches.water == 'r')]['water_name'].unique()
munis_joined = ', '.join(sorted(fd_pop_map.index))

muni_string = F"""**The municipalities in this report:**\n\n{munis_joined}
"""
md(muni_string)

The municipalities in this report:

Aarau, Allaman, Ascona, Beatenberg, Bellinzona, Bern, Biel/Bienne, Boudry, Bourg-en-Lavaux, Brienz (BE), Brissago, Brugg, Brügg, Burgdorf, Bönigen, Cheyres-Châbles, Châtel-Saint-Denis, Cudrefin, Dietikon, Erlach, Estavayer, Freienbach, Gals, Gambarogno, Gebenstorf, Genève, Gland, Glarus Nord, Grandson, Hauterive (NE), Hünenberg, Kallnach, Köniz, Küsnacht (ZH), La Tour-de-Peilz, Lausanne, Lavey-Morcles, Le Landeron, Leuk, Ligerz, Locarno, Lugano, Luterbach, Lüscherz, Merenschwand, Minusio, Montreux, Neuchâtel, Nidau, Port, Préverenges, Quarten, Rapperswil-Jona, Richterswil, Riddes, Rubigen, Saint-Gingolph, Saint-Sulpice (VD), Salgesch, Schmerikon, Sion, Solothurn, Spiez, Stäfa, Thun, Tolochenaz, Unterengstringen, Unterseen, Val-de-Travers, Versoix, Vevey, Vinelz, Walenstadt, Walperswil, Weesen, Weggis, Yverdon-les-Bains, Zug, Zürich

lakes_joined = ', '.join(sorted(lakes))

lake_string = F"""**The lakes in this report:**\n\n{lakes_joined}
"""
md(lake_string)

The lakes in this report:

Bielersee, Brienzersee, Lac Léman, Lago Maggiore, Lago di Lugano, Neuenburgersee, Quatre Cantons, Thunersee, Walensee, Zugersee, Zurichsee

The rivers in this report:

Aare, Aare|Nidau-Büren-Kanal, Alpes Valaisannes, Cassarate, Dorfbach, Emme, Escherkanal, Jona, La Thièle, Limmat, Linthkanal, Maggia, Massif du Jura, Préalpes Fribourgeoises, Reuss, Rhône, Schüss, Seez, Sihl, Ticino

obj_string = '{:,}'.format(fd_n_obj)
surv_string = '{:,}'.format(fd_n_samps)
pop_string = '{:,}'.format(int(fd_effected_population[0]))

date_quantity_context = F"For the period between {start_date[:-3]} and {end_date[:-3]}, {obj_string } objects were removed and identified in the course of {surv_string} surveys."
geo_context = F"Those surveys were conducted at {fd_n_locs} different locations."
admin_context = F"There are {fd_n_munis} different municipalities represented in these results with a combined population of approximately {pop_string}."
md(F"{date_quantity_context} {geo_context } {admin_context}")

For the period between 2020-03 and 2021-05, 54,997 objects were removed and identified in the course of 389 surveys. Those surveys were conducted at 146 different locations. There are 79 different municipalities represented in these results with a combined population of approximately 1,753,630.

20.8.3. Results of Speramans ranked correlation test